In [1]:
# Here are the imports we'll need for this notebook
import re

import pandas as pd
from nltk import FreqDist
In [2]:
# This will execute, and show the output from,
# all code cells of the specified notebook.
# It might take a couple of minutes to process.
%run ./CleanText.ipynb
In [3]:
# Reload the variable `df` from IPython's %store persistence
# (presumably saved there by CleanText.ipynb via `%store df` — TODO confirm)
%store -r df
In [4]:
# And take a quick look at the first few rows of what we've got
df.head()
Out[4]:
In [5]:
#Let's see what hashtags are most common
#As we can see from above, we'll need a way to extract individual hashtags from tweets that used multiple hashtags
def extract_hashtags(hashtag):
    """Return every double-quoted substring contained in ``hashtag``.

    Parameters
    ----------
    hashtag : str
        Text holding zero or more double-quoted tags, for example
        '["MAGA", "Trump"]' (presumably one cell of the ``hashtags``
        column — confirm against CleanText.ipynb).

    Returns
    -------
    list of str
        The quoted substrings in order of appearance, without the quotes.
        Empty when nothing is quoted or when ``hashtag`` is not a string
        (e.g. a missing value such as NaN or None).
    """
    # Missing cells arrive as float NaN / None; re.findall would raise
    # TypeError on them, so treat them as "no hashtags".
    if not isinstance(hashtag, str):
        return []
    return re.findall(r'"([^"]*)"', hashtag)
In [6]:
# Build one list of hashtags per tweet, skipping tweets whose hashtags field
# is exactly two characters long (i.e. the empty list "[]").
# We iterate over the column's values rather than looking rows up with
# df['hashtags'][x] for x in range(len(df)): that form is label-based, so it
# raises KeyError (or reads the wrong row) whenever the index is not 0..n-1.
tags = [extract_hashtags(raw) for raw in df['hashtags'] if len(raw) != 2]
In [7]:
# `tags` is a list of lists; concatenate the sublists, in order, into one flat list
all_hastags = []
for tweet_tags in tags:
    all_hastags.extend(tweet_tags)
In [8]:
# Count how often each hashtag occurs and plot the ten most frequent ones.
# The title lets the figure stand on its own; the trailing semicolon keeps
# the value returned by plot() from being echoed as the cell's output.
fdist = FreqDist(all_hastags)
fdist.plot(10, title='Ten most common hashtags');
This plot looks roughly like what I'd expect, with one exception: #MuslimBanprotest looks like a hashtag used by people opposed to Trump. Let's look at these tweets in more detail to see whether they are false positives.
In [9]:
# Let's make a df with tweets that could be false positives.
# regex=False matches the tag as literal text; na=False counts rows with a
# missing hashtags value as non-matches, so the mask stays purely boolean.
poss_false_positive = df[df['hashtags'].str.contains('MuslimBanprotest', regex=False, na=False)]
poss_false_positive.head()
Out[9]:
In [10]:
# Show the full text of every cell instead of truncating long values
# (None = no width limit). The old spelling, -1, is deprecated since
# pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)
poss_false_positive['message'].head(20)
Out[10]:
In [11]:
# Many of these are retweets, so keep only the rows whose
# 'retweet' column equals 'N' and show their messages.
retweet_is_n = poss_false_positive['retweet'] == 'N'
poss_false_positive.loc[retweet_is_n, 'message'].head(20)
Out[11]:
Hmm, tweets are messy